{
"cells":[
{"cell_type":"markdown","metadata":{},"source":[
"# Party-level manifesto distances\n",
"\n",
"The notebook runs in two stages:\n",
"\n",
"1. **Statement embeddings to party-level similarities.** Every non-empty statement in a relevant category is encoded with a sentence-transformer model. Cosine similarities are computed between all statements inside one country/date/category chunk, and the mean similarity per party pair is saved to `party_level_similarities.csv`.\n",
"2. **Analysis dataset.** The similarities are merged with the `pop_treat_*` treatment variables from the corpus, controls from the Manifesto Main Dataset, incumbent-leader information and missing-category indicators, and the result is saved to `distance_data.dta`.\n",
"\n",
"All file locations are set in the configuration cell."
]},
{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":40046,"status":"ok","timestamp":1700117093209,"user":{"displayName":"Marcel Garz","userId":"11492880614359963597"},"user_tz":-60},"id":"VzKzjJJb542a","outputId":"4cf2e7cb-d9e7-4d07-8e6a-b4b0e4a019d2"},"outputs":[],"source":[
"%pip install sentence_transformers\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sentence_transformers import SentenceTransformer, util"
]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[
"# Central configuration: every file this notebook reads or writes lives in DATA_DIR.\n",
"DATA_DIR = '~/manifestos'\n",
"CORPUS_CSV = f'{DATA_DIR}/corpus_with_vars.csv'\n",
"SIMILARITY_CSV = f'{DATA_DIR}/party_level_similarities.csv'\n",
"CONTROLS_DTA = f'{DATA_DIR}/MPDataset_MPDS2023a_stata14.dta'\n",
"LEADERS_XLS = f'{DATA_DIR}/leaders.xls'\n",
"OUTPUT_DTA = f'{DATA_DIR}/distance_data.dta'\n",
"\n",
"# Sentence-embedding model used to encode the manifesto statements\n",
"MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'"
]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[
"def with_suffix(frame, columns, suffix):\n",
"    \"\"\"Return a copy of `frame` with every listed column renamed to `<column><suffix>`.\n",
"\n",
"    Columns that are not listed keep their names. Used to build the party1 and\n",
"    party2 variants of a party-level table before merging both onto the dyads.\n",
"    \"\"\"\n",
"    return frame.rename(columns={col: f'{col}{suffix}' for col in columns})\n",
"\n",
"\n",
"def select_dyads(frame, leader1, right2, left2, leader2):\n",
"    \"\"\"Return the dyads whose party1 has both populism flags at 0 and whose other flags match.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    frame : DataFrame holding `pop_treat_right*`, `pop_treat_left*` and\n",
"        `incumbent_leader*` columns for party1 and party2.\n",
"    leader1 : value `incumbent_leader1` must equal.\n",
"    right2, left2 : values `pop_treat_right2` and `pop_treat_left2` must equal.\n",
"    leader2 : value `incumbent_leader2` must equal.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The matching rows of `frame`, in their original order. Rows with a missing\n",
"    value in any of the six columns never match.\n",
"    \"\"\"\n",
"    return frame.loc[\n",
"        (frame['pop_treat_right1'] == 0)\n",
"        & (frame['pop_treat_left1'] == 0)\n",
"        & (frame['incumbent_leader1'] == leader1)\n",
"        & (frame['pop_treat_right2'] == right2)\n",
"        & (frame['pop_treat_left2'] == left2)\n",
"        & (frame['incumbent_leader2'] == leader2)\n",
"    ]"
]},
{"cell_type":"markdown","metadata":{},"source":[
"## Stage 1: statement embeddings and party-level cosine similarities"
]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"Oowd_7-Snsxc"},"outputs":[],"source":[
"model = SentenceTransformer(MODEL_NAME)"
]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"PST_OT9j7tvT"},"outputs":[],"source":[
"df = pd.read_csv(CORPUS_CSV)\n",
"\n",
"# compute cosine only for non-empty statements in relevant categories\n",
"# (.copy() so the ID columns added below are written to an independent frame)\n",
"df = df.loc[(df['category'] != 'other') & (df['text'] != '.')].copy()"
]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"EKRF3yNx3aUn"},"outputs":[],"source":[
"# create joint id to split corpus in chunks (to save memory)\n",
"# the ID parts are cast to str so they can be concatenated with '|' and split again later\n",
"df = df.astype({'date': str, 'category': str, 'country': str, 'party': str, 'pos': str})\n",
"\n",
"# all_id identifies a statement, chunk_id the country/date/category chunk it belongs to\n",
"df['all_id'] = df['party'] + '|' + df['date'] + '|' + df['category'] + '|' + df['pos']\n",
"df['chunk_id'] = df['country'] + '|' + df['date'] + '|' + df['category']"
]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"IcxXdeP1pjOI"},"outputs":[],"source":[
"# loop over chunks to save memory\n",
"id_variable = 'chunk_id'\n",
"id_cols = ['party', 'date', 'category', 'pos']\n",
"\n",
"chunk_results = []  # one party-pair aggregate per chunk\n",
"\n",
"# groupby hands over each chunk directly instead of re-scanning the whole frame\n",
"# for every chunk; sort=False keeps the chunks in order of first appearance\n",
"for _, df_chunk in df.groupby(id_variable, sort=False):\n",
"\n",
"    # Compute embeddings\n",
"    sentences = df_chunk['text'].values.tolist()\n",
"    embeddings = model.encode(sentences, convert_to_tensor=True)\n",
"\n",
"    # Compute cosine similarities between all pairs of statements in the chunk\n",
"    cos = util.cos_sim(embeddings, embeddings)\n",
"\n",
"    # Convert the similarity tensor to a DataFrame, using the combined IDs as column headers\n",
"    # NOTE(review): this assumes all_id is unique within a chunk; duplicates would\n",
"    # produce duplicate column labels - confirm against the corpus\n",
"    cos = pd.DataFrame(cos.cpu().numpy(), columns=df_chunk['all_id'].to_numpy())\n",
"\n",
"    # Merge IDs with cosine similarity matrix (row i and column i refer to the same statement)\n",
"    ids = df_chunk[id_cols].reset_index(drop=True)\n",
"    pairs = pd.concat([ids, cos], axis=1)\n",
"\n",
"    # Reshape from wide to long: one row per ordered pair of statements\n",
"    pairs = pd.melt(pairs, id_vars=id_cols, var_name='all_id', value_name='cosine')\n",
"\n",
"    # split combined IDs\n",
"    pairs[['party2', 'date2', 'category2', 'pos2']] = pairs['all_id'].str.split('|', expand=True)\n",
"\n",
"    # drop self-comparisons (pairs of statements from the same party)\n",
"    pairs = pairs[pairs['party'] != pairs['party2']]\n",
"\n",
"    # average over statements: one mean similarity per ordered party pair\n",
"    chunk_results.append(\n",
"        pairs.groupby(['date', 'category', 'party', 'party2'], as_index=False).agg({'cosine': 'mean'})\n",
"    )\n",
"\n",
"# Concatenate the resulting DataFrames and save on disk\n",
"result_agg = pd.concat(chunk_results, ignore_index=True)\n",
"result_agg.to_csv(SIMILARITY_CSV, sep=',', index=False, encoding='utf-8')"
]},
{"cell_type":"markdown","metadata":{},"source":[
"## Stage 2: assemble the analysis dataset"
]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"Z4nNem_2Cbl4"},"outputs":[],"source":[
"# read necessary files to assemble analysis dataset\n",
"\n",
"# corpus for treatment variables and emotion words\n",
"# (read once; also used for the missing-category indicators in the next cell)\n",
"corpus = pd.read_csv(CORPUS_CSV)\n",
"\n",
"# cosine similarity\n",
"collapsed_party = pd.read_csv(SIMILARITY_CSV).rename(columns={'party': 'party1'})\n",
"\n",
"# Manifesto Main Dataset for controls\n",
"per_vars = ['per503', 'per410', 'per416', 'per504', 'per505', 'per5041', 'per506',\n",
"            'per5061', 'per507', 'per607', 'per6071', 'per6072', 'per608', 'per6081']\n",
"controls = pd.read_stata(CONTROLS_DTA, convert_categoricals=False)\n",
"controls = controls[['countryname', 'party', 'partyname', 'date', 'parfam', 'pervote', 'rile'] + per_vars].copy()\n",
"controls['party'] = controls['party'].astype(int)\n",
"controls['date'] = controls['date'].astype(int)\n",
"\n",
"# mean of rile over all rows of the same country\n",
"controls['rile_country_mean'] = controls.groupby('countryname')['rile'].transform('mean')\n",
"\n",
"# info on presidents / prime ministers\n",
"leaders = pd.read_excel(LEADERS_XLS)\n",
"leaders = leaders[['date', 'party', 'incumbent_leader']]\n",
"\n",
"# reshape cosine score from long to wide (category dimension)\n",
"collapsed_party = pd.pivot(collapsed_party, index=['date', 'party1', 'party2'], columns='category', values='cosine')\n",
"collapsed_party = collapsed_party.rename(columns={'multicult': 'cos_multicult', 'combo': 'cos_combo'})\n",
"collapsed_party = collapsed_party.reset_index()\n",
"\n",
"# corpus: collapse to party level and create \"party1\" and \"party2\"\n",
"party_treat = corpus.groupby(['country', 'date', 'party'], as_index=False).agg(\n",
"    {'pop_treat_left': 'max', 'pop_treat_right': 'max'}\n",
")\n",
"treat_vars = ['party', 'pop_treat_right', 'pop_treat_left']\n",
"\n",
"# pair every party with every other party of the same country and date\n",
"df3 = pd.merge(\n",
"    with_suffix(party_treat, treat_vars, '1'),\n",
"    with_suffix(party_treat, treat_vars, '2'),\n",
"    on=['country', 'date'],\n",
"    how='right',\n",
")\n",
"df3 = df3[df3['party1'] != df3['party2']]\n",
"\n",
"# create position balances\n",
"controls['multicult'] = controls['per607'] - controls['per608']\n",
"controls['combo'] = (\n",
"    controls['per504'] + controls['per506'] + controls['per503']\n",
"    - controls['per505'] - controls['per507']\n",
")\n",
"controls = controls.drop(columns=per_vars)\n",
"\n",
"# controls for \"party1\" and \"party2\" observations\n",
"# only the party2 variant keeps countryname and rile_country_mean, so they enter the merge once\n",
"control_vars = ['party', 'partyname', 'parfam', 'pervote', 'rile', 'multicult', 'combo']\n",
"controls2 = with_suffix(controls, control_vars, '2')\n",
"controls = with_suffix(controls, control_vars, '1').drop(columns=['countryname', 'rile_country_mean'])\n",
"\n",
"# leader info for \"party1\" and \"party2\" observations\n",
"leader_vars = ['party', 'incumbent_leader']\n",
"leaders2 = with_suffix(leaders, leader_vars, '2')\n",
"leaders = with_suffix(leaders, leader_vars, '1')"
]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"Rj_Uyou3wMlh"},"outputs":[],"source":[
"# create indicators for missing categories\n",
"miss = corpus.loc[corpus['category'] != 'other']\n",
"\n",
"miss = miss.groupby(['date', 'party', 'category'], as_index=False).agg({'pos': 'max', 'text': 'first'})\n",
"\n",
"# flag = 1 when the highest pos in the group is 1 and the group's first text is \".\"\n",
"miss['miss'] = np.where((miss['pos'] == 1) & (miss['text'] == '.'), 1, 0)\n",
"\n",
"miss = pd.pivot(miss, index=['date', 'party'], columns='category', values='miss')\n",
"miss = miss.rename(columns={'multicult': 'miss_multicult', 'combo': 'miss_combo'}).reset_index()\n",
"\n",
"miss_vars = ['party', 'miss_multicult', 'miss_combo']\n",
"miss2 = with_suffix(miss, miss_vars, '2')\n",
"miss = with_suffix(miss, miss_vars, '1')"
]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"H2us-0_5xnES"},"outputs":[],"source":[
"# merge everything\n",
"# how='right' keeps every dyad from df3, also those without a similarity score\n",
"merged = pd.merge(collapsed_party, df3, on=['date', 'party1', 'party2'], how='right')\n",
"\n",
"merged = pd.merge(merged, controls, on=['date', 'party1'], how='left')\n",
"merged = pd.merge(merged, controls2, on=['date', 'party2'], how='left')\n",
"\n",
"merged = pd.merge(merged, leaders, on=['date', 'party1'], how='left')\n",
"merged = pd.merge(merged, leaders2, on=['date', 'party2'], how='left')\n",
"\n",
"merged = pd.merge(merged, miss, on=['date', 'party1'], how='left')\n",
"merged = pd.merge(merged, miss2, on=['date', 'party2'], how='left')\n",
"\n",
"# differences in positions\n",
"merged['diff_multicult'] = (merged['multicult1'] - merged['multicult2']).abs()\n",
"merged['diff_combo'] = (merged['combo1'] - merged['combo2']).abs()"
]},
{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":526,"status":"ok","timestamp":1700117458961,"user":{"displayName":"Marcel Garz","userId":"11492880614359963597"},"user_tz":-60},"id":"-D4NyPaZxpwX","outputId":"76a67582-0266-4c61-ef96-c0c13a64afe7"},"outputs":[],"source":[
"# generate year (date divided by 100, truncated to an integer)\n",
"merged['year'] = (merged['date'] / 100).astype(int)\n",
"\n",
"# drop Switzerland (because by design no clear head of state)\n",
"# (.copy() so the column updates below are written to an independent frame)\n",
"merged = merged.loc[merged['countryname'] != 'Switzerland'].copy()\n",
"\n",
"for side in ('1', '2'):\n",
"    leader_col = f'incumbent_leader{side}'\n",
"\n",
"    # replace incumbent_leader = 0 if missing\n",
"    merged[leader_col] = merged[leader_col].fillna(0)\n",
"\n",
"    # replace incumbent_leader = 0 if populist\n",
"    is_populist = (merged[f'pop_treat_right{side}'] == 1) | (merged[f'pop_treat_left{side}'] == 1)\n",
"    merged[leader_col] = np.where(is_populist, 0, merged[leader_col])"
]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"N-7utm7DrYUT"},"outputs":[],"source":[
"# drop duplicate dyads\n",
"# goal = compare parties without state leadership (\"party1\") to:\n",
"#        - other parties without incumbent leader\n",
"#        - other parties with non-populist incumbent leader\n",
"#        - other parties with right-wing populist leader\n",
"#        - other parties with left-wing populist leader\n",
"\n",
"# date and party codes become strings here and stay strings in the saved file\n",
"merged = merged.astype({'date': str, 'party1': str, 'party2': str})\n",
"merged['all_id1'] = merged['date'] + '|' + merged['party1']\n",
"merged['all_id2'] = merged['date'] + '|' + merged['party2']\n",
"\n",
"# Directed dyads that are kept as they are. Each tuple is\n",
"# (incumbent_leader1, pop_treat_right2, pop_treat_left2, incumbent_leader2);\n",
"# party1 always has both populism flags equal to 0.\n",
"# NOTE(review): the previous cell sets incumbent_leader to 0 for every party with a\n",
"# populism flag equal to 1, so the patterns (0, 1, 0, 1) and (0, 0, 1, 1) cannot match\n",
"# any row when the cells are run in order; they are kept unchanged - confirm intent.\n",
"directed_patterns = [\n",
"    (0, 1, 0, 0),\n",
"    (0, 0, 1, 0),\n",
"    (0, 0, 0, 1),\n",
"    (0, 1, 0, 1),\n",
"    (0, 0, 1, 1),\n",
"    (1, 1, 0, 0),\n",
"    (1, 0, 1, 0),\n",
"]\n",
"directed = [select_dyads(merged, *pattern) for pattern in directed_patterns]\n",
"\n",
"# Dyads in which all six flags are 0 appear in both orders (A-B and B-A).\n",
"# Sorting the two IDs within each row makes both orders identical, so\n",
"# drop_duplicates keeps only the first occurrence of each unordered pair.\n",
"mutual = select_dyads(merged, 0, 0, 0, 0)\n",
"unordered_ids = pd.DataFrame(np.sort(mutual[['all_id1', 'all_id2']], axis=1), index=mutual.index)\n",
"mutual = mutual.loc[unordered_ids.drop_duplicates(keep='first').index]\n",
"\n",
"merged = pd.concat(directed + [mutual]).sort_values(by=['all_id1'])"
]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"Gd8TuYnWGnUL"},"outputs":[],"source":[
"# create type of party variable\n",
"\n",
"merged['party_type1'] = 'any party, not incumbent leader'\n",
"\n",
"right2 = merged['pop_treat_right2']\n",
"left2 = merged['pop_treat_left2']\n",
"leader2 = merged['incumbent_leader2']\n",
"\n",
"# the four conditions are mutually exclusive; rows matching none keep the blank label ' '\n",
"type_conditions = [\n",
"    # other mainstream not incumbent leader\n",
"    (right2 == 0) & (left2 == 0) & (leader2 == 0),\n",
"    # other mainstream and incumbent leader\n",
"    (right2 == 0) & (left2 == 0) & (leader2 == 1),\n",
"    # right-wing populist incumbent leader\n",
"    (right2 == 1) & (left2 == 0) & (leader2 == 0),\n",
"    # left-wing populist incumbent leader\n",
"    (right2 == 0) & (left2 == 1) & (leader2 == 0),\n",
"]\n",
"type_labels = [\n",
"    'any party, not incumbent leader',\n",
"    'party with non-populist incumbent leader',\n",
"    'party with right-wing populist incumbent leader',\n",
"    'party with left-wing populist incumbent leader',\n",
"]\n",
"merged['party_type2'] = np.select(type_conditions, type_labels, default=' ')"
]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"U7S-bg2ctaKv"},"outputs":[],"source":[
"for topic in ('multicult', 'combo'):\n",
"    cos_col = f'cos_{topic}'\n",
"    miss1 = merged[f'miss_{topic}1']\n",
"    miss2 = merged[f'miss_{topic}2']\n",
"\n",
"    # rescale cosine similarity to vary between 0 and 1 (min-max over the retained dyads)\n",
"    lowest, highest = merged[cos_col].min(), merged[cos_col].max()\n",
"    merged[cos_col] = (merged[cos_col] - lowest) / (highest - lowest)\n",
"\n",
"    # How to deal with cases where manifesto does not make any statements in a given category?\n",
"    # --> if neither party talks about it: not differentiated at all / identical approach --> cos sim = 1\n",
"    neither_talks = (miss1 == 1) & (miss2 == 1)\n",
"    merged[cos_col] = np.where(neither_talks, 1, merged[cos_col])\n",
"\n",
"    # --> if one party talks about an issue while the other does not: maximally differentiated --> cos sim = 0\n",
"    only_one_talks = ((miss1 == 1) & (miss2 == 0)) | ((miss1 == 0) & (miss2 == 1))\n",
"    merged[cos_col] = np.where(only_one_talks, 0, merged[cos_col])"
]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"b_OcQWKaeTcS"},"outputs":[],"source":[
"# convert cosine similarity to cosine distance (to align with position measures)\n",
"merged['cos_multicult'] = 1 - merged['cos_multicult']\n",
"merged['cos_combo'] = 1 - merged['cos_combo']\n",
"\n",
"merged = merged[['countryname', 'date', 'year', 'rile_country_mean',\n",
"                 'party1', 'party_type1', 'partyname1', 'parfam1', 'pervote1', 'rile1',\n",
"                 'party2', 'party_type2', 'partyname2', 'parfam2', 'pervote2', 'rile2',\n",
"                 'diff_multicult', 'diff_combo', 'multicult1', 'multicult2', 'combo1', 'combo2',\n",
"                 'cos_multicult', 'cos_combo']]\n",
"\n",
"merged = merged.sort_values(by=['countryname', 'date', 'party1'])\n",
"\n",
"# save to disk\n",
"merged.to_stata(OUTPUT_DTA, version=118)"
]}
],
"metadata":{"colab":{"authorship_tag":"ABX9TyOHoQlKHgkco+q0Em4U2u6A","provenance":[{"file_id":"1xk0dzNVtn37pnn0R9r-TGNWonW1eS-C5","timestamp":1684831223377},{"file_id":"1hHYukMSws777vWhImJd6l80n_3pE_BqX","timestamp":1684485132137},{"file_id":"1jX3khvM5Q9M7xd-c50yefPwJS-BI3xrK","timestamp":1684390167556},{"file_id":"13f9ueU8KdeidCbTJ5RgegLeGGOIdyKTY","timestamp":1684302769764},{"file_id":"11l0nN6YqzWJHN2KQHPsztztYbl8eyAdz","timestamp":1684161524757},{"file_id":"16Xy1RY4oVz4HN6ofuOLLrKvAOBNVkC17","timestamp":1683614188215}]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},
"nbformat":4,
"nbformat_minor":0
}
